Simulation for comparing SSH, SSH-SSL, SSH-FL and Seurat, using both Louvain Clustering method and SC3 clustering

Using the R Package Splatter, which uses a Gamma-Poisson model, I simulated scRNA-seq data consisting of 200 cells x 2000 genes 

5 groups (clusters), with probabilities of 0.7,0.1,0.02,0.15 and 0.03, mimicking the population percentages seen in Tabula Sapiens data  Mimicks the rare cell populations seen in real data.

Clustering performance are evaluated using Adjusted Rand Index, which compares predicted clusters with true labels.

##function for SSH 
ari_ssh <- c() 
ari_seurat <- c() 
ari_feastfl <- c() 
ari_sparse <- c() 
ari_scm <- c() 
ari_sc3_ssl <- c() 
ari_sc3_fl <- c() 
ari_sc3_seurat <- c() 
ari_sc3_lasso <- c() 
ari_sc3_scm <- c() 

library(splatter)
library(SCMarker)
library(FEAST)
compare_met <- function(nreps, nogenes, nocells, sc3 = FALSE) { 
for (i in 1:nreps) { 
sim.groups3 <- splatSimulate(nGenes=nogenes, batchCells=nocells, group.prob = c(0.7,0.1,0.02,0.15,0.03), method = "groups",
                             verbose = FALSE) 

Y <- counts(sim.groups3) 

Y <- as.matrix(Y)
library(dplyr)
tot_med <- colSums(Y) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y <- apply(Y, 2, norm1)
Y <- Y * tot_med
Y <- Y + 0.1
Y <- t(Y) 

Y <- log2(Y - min(Y) + 1)
tb <- tibble(cell = rownames(Y)) %>%
  bind_cols(as_tibble(Y)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
#library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, colnames(t))

data <- counts(sim.groups3) 
data <- CreateSeuratObject(counts = data,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
sobj <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

sobj <- FindNeighbors(sobj, dims=1:20)  
sobj <- FindClusters(sobj) 

trueclass <- colData(sim.groups3)$Group  
ari_ssh[i] <- eval_Cluster(sobj@meta.data$seurat_clusters, trueclass)["ARI"]

####compare seurat 
data <- counts(sim.groups3)
library(Seurat)
data2 <- CreateSeuratObject(counts = data,
                            project = "seurat",
                            min.features = 0,
                            min.cells = 0)
data2 <- FindVariableFeatures(data2, selection.method = "vst", nfeatures = length(idxssl3)) 

all.genes <- rownames(data2) 
data2 <- ScaleData(data2, features=all.genes)
# set variable genes as all discriminant markers
##
data2 <- RunPCA(data2, features=VariableFeatures(data2)) 

data2 <- FindNeighbors(data2, dims=1:20)  
data2 <- FindClusters(data2) #resolution = 1.2) #resolution = 1.08)  #0.76 
trueclass <- colData(sim.groups3)$Group  
ari_seurat[i] <- eval_Cluster(data2@meta.data$seurat_clusters, trueclass)["ARI"]

#LASSO sparse 
shc.permute.out2 <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                              nperms = 3,
                                                              wbounds = seq(1.1, 10, len = 100))
shc.out2 <- sparcl::HierarchicalSparseCluster(x = t,
                                              wbound = shc.permute.out2$bestw)

w <- shc.out2$ws
names(w) <- colnames(t)
w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

names <- attr(w_nonzero, "names")
idxlasso2 <- match(names, colnames(t))

data3 <- counts(sim.groups3) 
data3 <- CreateSeuratObject(counts = data3,
                            min.features = 0,
                            min.cells = 0)
data3 <- NormalizeData(data3)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data3 <- ScaleData(data3, features = rownames(data3))
# Perform linear dimensional reduction

data3 <- RunPCA(data3, features = rownames(data3)[idxlasso2]) 

pca <- data3[['pca']]  

data3 <- FindNeighbors(data3, dims=1:min(length(pca@stdev),20))   

data3 <- FindClusters(data3) #resolution = 0.76)  
trueclass <- colData(sim.groups3)$Group  
ari_sparse[i] <- eval_Cluster(data3@meta.data$seurat_clusters, trueclass)["ARI"]

#FEAST - FL 
Y <- counts(sim.groups3)
#Y = process_Y(Y) 
ixs = FEAST(Y, k=5)
t2 <- t[, ixs] 

out4 <- SHC_FL_gapstat(t2,
                       lambda1s = seq(0, 0.5*max(nrow(t), ncol(t)), length = 5),
                       lambda2s = seq(1, 10*max(nrow(t), ncol(t)), length = 5),
                       nperms = 3)

w2 <- out4$result$w

names(w2) <- colnames(t2)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl4 <- match(names, colnames(t2))

data4 <- counts(sim.groups3) 
data4 <- CreateSeuratObject(counts = data4,
                           project = "fl",
                           min.features = 0,
                           min.cells = 0)
data4 <- NormalizeData(data4)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data4 <- ScaleData(data4, features = rownames(data4)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
data4 <- RunPCA(data4, features = rownames(data4)[ixs][idxssl4]) 

data4 <- FindNeighbors(data4, dims=1:20)  
data4 <- FindClusters(data4) 

trueclass <- colData(sim.groups3)$Group 
ari_feastfl[i] <- eval_Cluster(data4@meta.data$seurat_clusters, trueclass)["ARI"]

#SC-Marker 
Y = process_Y(Y) 
res=ModalFilter(data=Y,geneK=10,cellK=10,width=2)# default widt
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30)  
head(res$marker)
idxssl6<-match(res$marker, rownames(Y)) 
data6 <- counts(sim.groups3) 
data6 <- CreateSeuratObject(counts = data6,
                            min.features = 0,
                            min.cells = 0)
data6 <- NormalizeData(data6)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data6 <- ScaleData(data6, features = rownames(data6)) 
# Perform linear dimensional reduction

data6 <- RunPCA(data6, features = rownames(data6)[idxssl6]) 

data6 <- FindNeighbors(data6, dims=1:20)  
data6 <- FindClusters(data6) 

trueclass <- colData(sim.groups3)$Group 
ari_scm[i] <- eval_Cluster(data6@meta.data$seurat_clusters, trueclass)["ARI"]

if (sc3 == TRUE) {
  sobj2 <- counts(sim.groups3)
  #library(FEAST)
  sc3_reslv <- SC3_Clust(sobj2, k=5, input_markers=VariableFeatures(data2)) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_seurat[i] <- eval_Cluster(sc3_reslv$cluster, trueclass)["ARI"]
  
  sc3_reslasso <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxlasso2]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_lasso[i] <- eval_Cluster(sc3_reslasso$cluster, trueclass)["ARI"]
  
  sc3_res <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl3]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_ssl[i] <- eval_Cluster(sc3_res$cluster, trueclass)["ARI"]
  
  sc3_resfl <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[ixs][idxssl4]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_fl[i] <- eval_Cluster(sc3_resfl$cluster, trueclass)["ARI"]
  
  sc3_resscm <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl6]) 
  ari_sc3_scm[i] <- eval_Cluster(sc3_resscm$cluster, trueclass)["ARI"]
    } 
} 
results <- list(ari_ssh, ari_seurat, ari_feastfl, ari_sparse, ari_scm, ari_sc3_seurat, ari_sc3_lasso, ari_sc3_ssl, ari_sc3_fl, ari_sc3_scm)   
return(results) 
} 

#doing 10 simulations 
compare_met(100, 2000, 200, sc3 = TRUE) 

# 6 more 

Simulation results

Figure 1 : SHC-SSL and SHC-FL performed better than Seurat and SHC in clustering data with rare cell populations  (due to computation, this is not displayed here, please refer to PPT)

Figure 2: SHC-SSL and SHC-FL similarly performed better than SHC using SC3 clustering  (due to computation, this is not displayed here, please refer to PPT)

Comparing feature selection methods with real data - Tabula Sapiens human kidney cells data

Challenges - 7 cell types : Kidney epithelial cell, B cell, CD4 T cell, CD8 T cell, NK cell, Macropages and Endothelial cell  -> more than 8000 out of 9000 cells are kidney epithelial cells, Seurat performs poorly in clustering rare cell populations 

gap statistic

out <- readRDS("scRNA_shc_ssl_out.rds") 
plot(out$w_l0norm,
     out$gaps_mean,
     log = "x",
     xlab = "# Non-zero Features",
     ylab = "Gap Statistics",
     ylim = c(min(out$gaps_mean - out$gaps_se) - 0.0001,
              max(out$gaps_mean + out$gaps_se) + 0.0001),
     type = "l",
     lwd = 1)
arrows(x0 = out$w_l0norm,
       y0 = out$gaps_mean - out$gaps_se,
       x1 = out$w_l0norm,
       y1 = out$gaps_mean + out$gaps_se,
       code = 3, angle = 90, length = 0.02, lwd = 1)

selecting the non-zero weight features for recombine

w <- out$result$w
names(w) <- colnames(mt_expr)

w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

tb <- tibble(feature = names(w),
             w = w)
tb <- tb %>%
  mutate(i = 1:nrow(tb)) %>%
  mutate(label = ifelse(i <= 10, feature, ""))

options(ggrepel.max.overlaps = Inf)
ggline(tb, "i", "w",
            xlab = "feature index",
            label = "label",
            repel = TRUE,
            label.rectangle = TRUE,
            point.size = 0.1,
            plot_type = "p")

Seurat’s built in feature selection method (vst)

library(Seurat)
#normalize 
kidata <- NormalizeData(kidata)

#now, use blood1 for HVF selected by Seurat, and blood2 for ones selected by FEAST
kidata <- FindVariableFeatures(kidata, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(kidata) 
kidata <- ScaleData(kidata, features=all.genes)
#if using FEAST should input selected features here 
kidata1 <- RunPCA(kidata, features=VariableFeatures(object=kidata))
ElbowPlot(kidata1) #use 18 PCs 
kidata1 <- FindNeighbors(kidata1, dims=1:15)

kidata1 <- FindClusters(kidata1, resolution = 0.18) 
trueclass <- kidata@meta.data$free_annotation

evaluating the performance of Seurat’s built in feature selection method (vst)

library(FEAST)
ki_seurat<-eval_Cluster(kidata1@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_seurat, caption = "Cluster Evaluation Metrics")
Cluster Evaluation Metrics
x
ARI 0.2996862
Purity 0.9539467
Jaccard 0.4672912
FM 0.6821978

UMAP (with true labels)

#set.seed(1)
#kidata1 <- RunUMAP(kidata1, dims=1:50) 
DimPlot(kidata1, reduction="umap")

# Step 1: Add true class and clustering labels to the metadata
kidata1$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata1, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata1$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata1$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

Sparse Hierarchial Clustering (SHC)

#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

shc.permute.out <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                             nperms = 3,
                                                             wbounds = seq(1.1, 10, len = 100))


w3 <- shc.permute.out$result$w

names(w3) <- colnames(t)

w3 <- sort(w3, decreasing = TRUE)
w3_nonzero <- w3[w3 > 0]

names <- attr(w3_nonzero, "names")
idxssl4 <- match(names, rownames(Y))
length(idxssl4) 
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata4 <- RunPCA(sobj, features = rownames(data)[idxssl4]) 

kidata4 <- FindNeighbors(kidata2, dims=1:20)  
kidata4 <- FindClusters(kidata2, resolution=0.16)

kidata4$trueclass <- kidata1@meta.data$free_annotation

ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass)
kidata4 <- FindNeighbors(kidata4, dims=1:15)
kidata4 <- FindClusters(kidata4, resolution = 0.12) 
trueclass <- kidata@meta.data$free_annotation

evaluating performance with SHC

library(FEAST)
ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_shc, caption = "Cluster Evaluation Metrics")
Cluster Evaluation Metrics
x
ARI 0.2218051
Purity 0.9497977
Jaccard 0.3731324
FM 0.6082610

UMAP (with true labels)

set.seed(1)
kidata4 <- RunUMAP(kidata4, dims=1:50) 
DimPlot(kidata4, reduction="umap") 

library(ggplot2)
# Step 1: Add true class and clustering labels to the metadata
kidata4$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata4, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata4$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata4$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

RECOMBINE SHC-SSL algorithm

#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
library(recombine) 
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, rownames(Y))
length(idxssl3)
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata2 <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

kidata2 <- FindNeighbors(kidata2, dims=1:20)  
kidata2 <- FindClusters(kidata2, resolution=0.16)

kidata2$trueclass <- kidata1@meta.data$free_annotation

ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass)

evaluating SHC-SSL performance

kable(ki_recombine, caption = "Cluster Evaluation Metrics")  
Cluster Evaluation Metrics
x
ARI 0.8176945
Purity 0.9533243
Jaccard 0.9011177
FM 0.9490565

UMAP (with true labels)

kidata3 <- RunUMAP(kidata3, features=rownames(data)[idxssl3]) 
DimPlot(kidata3, reduction="umap") 

# Step 1: Add true class and clustering labels to the metadata
kidata3$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata3, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata3$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata3$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 

SCMarker algorithm

library(SCMarker)
dim(Y) 
Y1<-log(Y+1) 

res=ModalFilter(data=Y1,geneK=10,cellK=10,width=2) 
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30) 
head(res$marker) 
length(res$marker) #938 

evaluating performance of SCMarker

kidata3 <- RunPCA(kidata2, features = res$marker) 

kidata3 <- FindNeighbors(kidata3, dims=1:20)  
kidata3 <- FindClusters(kidata3, resolution = 0.12)
kidata3$trueclass <- kidata1@meta.data$free_annotation
ki_scmarker<-eval_Cluster(kidata3@meta.data$seurat_clusters, kidata3$trueclass)
#kidata2$trueclass <- kidata1@meta.data$free_annotation
#ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass) 
kable(ki_scmarker, caption = "Cluster Evaluation Metrics") 
Cluster Evaluation Metrics
x
ARI 0.8940461
Purity 0.9537392
Jaccard 0.9446944
FM 0.9718630

UMAP (with true labels)

#kidata2 <- RunUMAP(kidata2, features=genes) 
DimPlot(kidata2, reduction="umap") 

# Step 1: Add true class and clustering labels to the metadata
kidata2$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata2, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata2$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata2$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 

Compare the four methods in clustering accuracy

res_compare<-cbind(ki_seurat, ki_recombine, ki_scmarker, ki_shc) 
kable(res_compare, caption = "Cluster Evaluation Metrics") 
Cluster Evaluation Metrics
ki_seurat ki_recombine ki_scmarker ki_shc
ARI 0.2996862 0.8176945 0.8940461 0.2218051
Purity 0.9539467 0.9533243 0.9537392 0.9497977
Jaccard 0.4672912 0.9011177 0.9446944 0.3731324
FM 0.6821978 0.9490565 0.9718630 0.6082610

It is shown that the SHC-SSL feature selection algorithm outperforms both Seurat and traditional SHC, and it is comparable to the SCMarker algorithm.

---
title: Identifying rare cell populations and improving clustering accuracy using RECOMBINE algorithm
subtitle: Simulation and real data analysis 
author: Wanru Guo 
date: "Last compiled on `r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook: 
    toc: yes
    toc_depth: 4
    toc_float: yes
---

# Simulation for comparing SSH, SSH-SSL, SSH-FL and Seurat, using both Louvain Clustering method and SC3 clustering 
Using the R Package Splatter, which uses a Gamma-Poisson model, I simulated scRNA-seq data consisting of 200 cells x 2000 genes 

5 groups (clusters), with probabilities of 0.7,0.1,0.02,0.15 and 0.03, mimicking the population percentages seen in Tabula Sapiens data 
Mimicks the rare cell populations seen in real data. 

Clustering performance are evaluated using Adjusted Rand Index, which compares predicted clusters with true labels. 

```{r}
##function for SSH 
ari_ssh <- c() 
ari_seurat <- c() 
ari_feastfl <- c() 
ari_sparse <- c() 
ari_scm <- c() 
ari_sc3_ssl <- c() 
ari_sc3_fl <- c() 
ari_sc3_seurat <- c() 
ari_sc3_lasso <- c() 
ari_sc3_scm <- c() 

library(splatter)
library(SCMarker)
library(FEAST)
compare_met <- function(nreps, nogenes, nocells, sc3 = FALSE) { 
for (i in 1:nreps) { 
sim.groups3 <- splatSimulate(nGenes=nogenes, batchCells=nocells, group.prob = c(0.7,0.1,0.02,0.15,0.03), method = "groups",
                             verbose = FALSE) 

Y <- counts(sim.groups3) 

Y <- as.matrix(Y)
library(dplyr)
tot_med <- colSums(Y) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y <- apply(Y, 2, norm1)
Y <- Y * tot_med
Y <- Y + 0.1
Y <- t(Y) 

Y <- log2(Y - min(Y) + 1)
tb <- tibble(cell = rownames(Y)) %>%
  bind_cols(as_tibble(Y)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
#library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, colnames(t))

data <- counts(sim.groups3) 
data <- CreateSeuratObject(counts = data,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
sobj <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

sobj <- FindNeighbors(sobj, dims=1:20)  
sobj <- FindClusters(sobj) 

trueclass <- colData(sim.groups3)$Group  
ari_ssh[i] <- eval_Cluster(sobj@meta.data$seurat_clusters, trueclass)["ARI"]

####compare seurat 
data <- counts(sim.groups3)
library(Seurat)
data2 <- CreateSeuratObject(counts = data,
                            project = "seurat",
                            min.features = 0,
                            min.cells = 0)
data2 <- FindVariableFeatures(data2, selection.method = "vst", nfeatures = length(idxssl3)) 

all.genes <- rownames(data2) 
data2 <- ScaleData(data2, features=all.genes)
# set variable genes as all discriminant markers
##
data2 <- RunPCA(data2, features=VariableFeatures(data2)) 

data2 <- FindNeighbors(data2, dims=1:20)  
data2 <- FindClusters(data2) #resolution = 1.2) #resolution = 1.08)  #0.76 
trueclass <- colData(sim.groups3)$Group  
ari_seurat[i] <- eval_Cluster(data2@meta.data$seurat_clusters, trueclass)["ARI"]

#LASSO sparse 
shc.permute.out2 <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                              nperms = 3,
                                                              wbounds = seq(1.1, 10, len = 100))
shc.out2 <- sparcl::HierarchicalSparseCluster(x = t,
                                              wbound = shc.permute.out2$bestw)

w <- shc.out2$ws
names(w) <- colnames(t)
w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

names <- attr(w_nonzero, "names")
idxlasso2 <- match(names, colnames(t))

data3 <- counts(sim.groups3) 
data3 <- CreateSeuratObject(counts = data3,
                            min.features = 0,
                            min.cells = 0)
data3 <- NormalizeData(data3)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data3 <- ScaleData(data3, features = rownames(data3))
# Perform linear dimensional reduction

data3 <- RunPCA(data3, features = rownames(data3)[idxlasso2]) 

pca <- data3[['pca']]  

data3 <- FindNeighbors(data3, dims=1:min(length(pca@stdev),20))   

data3 <- FindClusters(data3) #resolution = 0.76)  
trueclass <- colData(sim.groups3)$Group  
ari_sparse[i] <- eval_Cluster(data3@meta.data$seurat_clusters, trueclass)["ARI"]

#FEAST - FL 
Y <- counts(sim.groups3)
#Y = process_Y(Y) 
ixs = FEAST(Y, k=5)
t2 <- t[, ixs] 

out4 <- SHC_FL_gapstat(t2,
                       lambda1s = seq(0, 0.5*max(nrow(t), ncol(t)), length = 5),
                       lambda2s = seq(1, 10*max(nrow(t), ncol(t)), length = 5),
                       nperms = 3)

w2 <- out4$result$w

names(w2) <- colnames(t2)
w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl4 <- match(names, colnames(t2))

data4 <- counts(sim.groups3) 
data4 <- CreateSeuratObject(counts = data4,
                           project = "fl",
                           min.features = 0,
                           min.cells = 0)
data4 <- NormalizeData(data4)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data4 <- ScaleData(data4, features = rownames(data4)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
data4 <- RunPCA(data4, features = rownames(data4)[ixs][idxssl4]) 

data4 <- FindNeighbors(data4, dims=1:20)  
data4 <- FindClusters(data4) 

trueclass <- colData(sim.groups3)$Group 
ari_feastfl[i] <- eval_Cluster(data4@meta.data$seurat_clusters, trueclass)["ARI"]

#SC-Marker 
Y = process_Y(Y) 
res=ModalFilter(data=Y,geneK=10,cellK=10,width=2)# default widt
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30)  
head(res$marker)
idxssl6<-match(res$marker, rownames(Y)) 
data6 <- counts(sim.groups3) 
data6 <- CreateSeuratObject(counts = data6,
                            min.features = 0,
                            min.cells = 0)
data6 <- NormalizeData(data6)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
data6 <- ScaleData(data6, features = rownames(data6)) 
# Perform linear dimensional reduction

data6 <- RunPCA(data6, features = rownames(data6)[idxssl6]) 

data6 <- FindNeighbors(data6, dims=1:20)  
data6 <- FindClusters(data6) 

trueclass <- colData(sim.groups3)$Group 
ari_scm[i] <- eval_Cluster(data6@meta.data$seurat_clusters, trueclass)["ARI"]

if (sc3 == TRUE) {
  sobj2 <- counts(sim.groups3)
  #library(FEAST)
  sc3_reslv <- SC3_Clust(sobj2, k=5, input_markers=VariableFeatures(data2)) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_seurat[i] <- eval_Cluster(sc3_reslv$cluster, trueclass)["ARI"]
  
  sc3_reslasso <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxlasso2]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_lasso[i] <- eval_Cluster(sc3_reslasso$cluster, trueclass)["ARI"]
  
  sc3_res <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl3]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_ssl[i] <- eval_Cluster(sc3_res$cluster, trueclass)["ARI"]
  
  sc3_resfl <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[ixs][idxssl4]) 
  trueclass <- colData(sim.groups3)$Group  
  ari_sc3_fl[i] <- eval_Cluster(sc3_resfl$cluster, trueclass)["ARI"]
  
  sc3_resscm <- SC3_Clust(sobj2, k=5, input_markers=rownames(sobj2)[idxssl6]) 
  ari_sc3_scm[i] <- eval_Cluster(sc3_resscm$cluster, trueclass)["ARI"]
    } 
} 
results <- list(ari_ssh, ari_seurat, ari_feastfl, ari_sparse, ari_scm, ari_sc3_seurat, ari_sc3_lasso, ari_sc3_ssl, ari_sc3_fl, ari_sc3_scm)   
return(results) 
} 

#doing 10 simulations 
compare_met(100, 2000, 200, sc3 = TRUE) 

# 6 more 

```

## Simulation results 
Figure 1 : SHC-SSL and SHC-FL performed better than Seurat and SHC in clustering data with rare cell populations  (due to computation, this is not displayed here, please refer to PPT) 

Figure 2: SHC-SSL and SHC-FL similarly performed better than SHC using SC3 clustering  (due to computation, this is not displayed here, please refer to PPT) 


# Comparing feature selection methods with real data - Tabula Sapiens human kidney cells data 

Challenges - 
7 cell types : Kidney epithelial cell, B cell, CD4 T cell, CD8 T cell, NK cell, Macropages and Endothelial cell 
-> more than 8000 out of 9000 cells are kidney epithelial cells, Seurat performs poorly in clustering rare cell populations 

gap statistic 

```{r}
out <- readRDS("scRNA_shc_ssl_out.rds") 
plot(out$w_l0norm,
     out$gaps_mean,
     log = "x",
     xlab = "# Non-zero Features",
     ylab = "Gap Statistics",
     ylim = c(min(out$gaps_mean - out$gaps_se) - 0.0001,
              max(out$gaps_mean + out$gaps_se) + 0.0001),
     type = "l",
     lwd = 1)
arrows(x0 = out$w_l0norm,
       y0 = out$gaps_mean - out$gaps_se,
       x1 = out$w_l0norm,
       y1 = out$gaps_mean + out$gaps_se,
       code = 3, angle = 90, length = 0.02, lwd = 1)
```

selecting the non-zero weight features for recombine 
```{r}
w <- out$result$w
names(w) <- colnames(mt_expr)

w <- sort(w, decreasing = TRUE)
w_nonzero <- w[w > 0]

tb <- tibble(feature = names(w),
             w = w)
tb <- tb %>%
  mutate(i = 1:nrow(tb)) %>%
  mutate(label = ifelse(i <= 10, feature, ""))

options(ggrepel.max.overlaps = Inf)
ggline(tb, "i", "w",
            xlab = "feature index",
            label = "label",
            repel = TRUE,
            label.rectangle = TRUE,
            point.size = 0.1,
            plot_type = "p")
```

## Seurat's built in feature selection method (vst)
```{r}
library(Seurat)
#normalize 
kidata <- NormalizeData(kidata)

#now, use blood1 for HVF selected by Seurat, and blood2 for ones selected by FEAST
kidata <- FindVariableFeatures(kidata, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(kidata) 
kidata <- ScaleData(kidata, features=all.genes)
#if using FEAST should input selected features here 
kidata1 <- RunPCA(kidata, features=VariableFeatures(object=kidata))
```

```{r}
ElbowPlot(kidata1) #use 18 PCs 
```

```{r}
kidata1 <- FindNeighbors(kidata1, dims=1:15)

kidata1 <- FindClusters(kidata1, resolution = 0.18) 
trueclass <- kidata@meta.data$free_annotation
```

### evaluating the performance of Seurat's built in feature selection method (vst)
```{r}
library(FEAST)
ki_seurat<-eval_Cluster(kidata1@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_seurat, caption = "Cluster Evaluation Metrics")
```
### UMAP (with true labels) 
```{r}
set.seed(1)
kidata1 <- RunUMAP(kidata1, dims=1:50) 
DimPlot(kidata1, reduction="umap")
```


```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata1$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata1, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata1$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata1$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )
```



## Sparse Hierarchial Clustering (SHC) 
```{r}
#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

shc.permute.out <- sparcl::HierarchicalSparseCluster.permute(x = t,
                                                             nperms = 3,
                                                             wbounds = seq(1.1, 10, len = 100))


w3 <- shc.permute.out$result$w

names(w3) <- colnames(t)

w3 <- sort(w3, decreasing = TRUE)
w3_nonzero <- w3[w3 > 0]

names <- attr(w3_nonzero, "names")
idxssl4 <- match(names, rownames(Y))
length(idxssl4) 
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata4 <- RunPCA(sobj, features = rownames(data)[idxssl4]) 

kidata4 <- FindNeighbors(kidata2, dims=1:20)  
kidata4 <- FindClusters(kidata2, resolution=0.16)

kidata4$trueclass <- kidata1@meta.data$free_annotation

ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass)
```

```{r}
kidata4 <- FindNeighbors(kidata4, dims=1:15)
```

```{r}
kidata4 <- FindClusters(kidata4, resolution = 0.12) 
trueclass <- kidata@meta.data$free_annotation
```

### evaluating performance with SHC 
```{r}
library(FEAST)
ki_shc<-eval_Cluster(kidata4@meta.data$seurat_clusters, trueclass) 

library(knitr)
kable(ki_shc, caption = "Cluster Evaluation Metrics")
```

### UMAP (with true labels) 
```{r}
set.seed(1)
kidata4 <- RunUMAP(kidata4, dims=1:50) 
```

```{r}
DimPlot(kidata4, reduction="umap") 
```


```{r}
library(ggplot2)
# Step 1: Add true class and clustering labels to the metadata
kidata4$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata4, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata4$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata4$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  )
```


## RECOMBINE SHC-SSL algorithm 
```{r}
#normalize 
Y1 <- as.matrix(Y1)

library(dplyr)
tot_med <- colSums(Y1) %>%
  median()
norm1 <- function(vec) {
  vec/sum(vec)
}
Y1 <- apply(Y1, 2, norm1)
Y1 <- Y1 * tot_med
Y1 <- Y1 + 0.1
Y1 <- t(Y1) 

Y1 <- log2(Y1 - min(Y1) + 1)
tb <- tibble(cell = rownames(Y1)) %>%
  bind_cols(as_tibble(Y1)) 

t <- as.matrix(tb[,-1])
rownames(t) <- tb$cell

# center for each feature
t <- scale(t, scale = FALSE)

# scale so that the overall variance is unit 
t <- t/sd(as.double(t)) 
dim(t)
View(t)

nperms = 3 
lambda1 <- 0.0001
lambda0s <- seq(0.001, 1000, length = 100)
library(recombine) 
# 
# # this is the most costly computation in RECOMBINE
#set.seed(1)   ##important!! 
library(recombine)
out3 <- SHC_SSL_gapstat(x = t,
                        nperms = nperms,
                        lambda0s = lambda0s,
                        lambda1 = lambda1)


w2 <- out3$result$w

names(w2) <- colnames(t)

w2 <- sort(w2, decreasing = TRUE)
w2_nonzero <- w2[w2 > 0]

names <- attr(w2_nonzero, "names")
idxssl3 <- match(names, rownames(Y))
length(idxssl3)
#data <- counts(sim.groups3) 
library(Seurat)

data <- CreateSeuratObject(counts = Y,
                           min.features = 0,
                           min.cells = 0)
data <- NormalizeData(data)
# set variable genes as all discriminant markers
#VariableFeatures(data) <- rownames(data)
# Scaling the data
sobj <- ScaleData(data, features = rownames(data)) 
# Perform linear dimensional reduction

##For SHC-SSL - 
kidata2 <- RunPCA(sobj, features = rownames(data)[idxssl3]) 

kidata2 <- FindNeighbors(kidata2, dims=1:20)  
kidata2 <- FindClusters(kidata2, resolution=0.16)

kidata2$trueclass <- kidata1@meta.data$free_annotation

ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass)
```

### evaluating SHC-SSL performance 

```{r}
kable(ki_recombine, caption = "Cluster Evaluation Metrics")  
```
### UMAP (with true labels) 

```{r}
kidata3 <- RunUMAP(kidata3, features=rownames(data)[idxssl3]) 
```

```{r}
DimPlot(kidata3, reduction="umap") 
```


```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata3$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata3, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata3$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata3$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 
```

## SCMarker algorithm 
```{r}
library(SCMarker)
dim(Y) 
Y1<-log(Y+1) 

res=ModalFilter(data=Y1,geneK=10,cellK=10,width=2) 
res=GeneFilter(obj=res)
res=getMarker(obj=res,k=300,n=30) 
head(res$marker) 
length(res$marker) #938 
```

### evaluating performance of SCMarker  
```{r}
kidata3 <- RunPCA(kidata2, features = res$marker) 

kidata3 <- FindNeighbors(kidata3, dims=1:20)  
kidata3 <- FindClusters(kidata3, resolution = 0.12)
kidata3$trueclass <- kidata1@meta.data$free_annotation
ki_scmarker<-eval_Cluster(kidata3@meta.data$seurat_clusters, kidata3$trueclass)
```

```{r}
#kidata2$trueclass <- kidata1@meta.data$free_annotation
#ki_recombine<-eval_Cluster(kidata2@meta.data$seurat_clusters, trueclass) 
kable(ki_scmarker, caption = "Cluster Evaluation Metrics") 
```

### UMAP (with true labels)
```{r}
#kidata2 <- RunUMAP(kidata2, features=genes) 
DimPlot(kidata2, reduction="umap") 
```

```{r}
# Step 1: Add true class and clustering labels to the metadata
kidata2$trueclass <- kidata1@meta.data$free_annotation

# Step 2: Extract UMAP coordinates and metadata
umap_data <- as.data.frame(Embeddings(kidata2, "umap"))  # Extract UMAP embeddings
colnames(umap_data) <- c("UMAP_1", "UMAP_2")             # Rename columns
umap_data$seurat_clusters <- kidata2$seurat_clusters      # Add cluster labels
umap_data$trueclass <- kidata2$trueclass                  # Add true class labels

# Step 3: Plot UMAP with overlayed true class labels
ggplot(umap_data, aes(x = UMAP_1, y = UMAP_2)) +
  geom_point(aes(color = as.factor(seurat_clusters)), alpha = 0.6, size = 2) + # Points for clusters
  geom_text(aes(label = trueclass), size = 3, vjust = -1, check_overlap = TRUE) + # Text overlay for true class
  labs(
    color = "Cluster", 
    x = "UMAP 1",
    y = "UMAP 2",
    title = "UMAP: Clustering Results with True Class Overlay"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",          # Adjust legend position
    plot.title = element_text(hjust = 0.5)  # Center the title
  ) 
```


## Compare the four methods in clustering accuracy 
```{r}
res_compare<-cbind(ki_seurat, ki_recombine, ki_scmarker, ki_shc) 
kable(res_compare, caption = "Cluster Evaluation Metrics") 
```

It is shown that the SHC-SSL feature selection algorithm outperforms both Seurat and traditional SHC, and it is comparable to the SCMarker algorithm. 




